# Load needed packages
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(colorspace)
library(bipartite)
## Warning: package 'bipartite' was built under R version 4.3.3
## Loading required package: vegan
## Warning: package 'vegan' was built under R version 4.3.3
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.6-6.1
## Loading required package: sna
## Warning: package 'sna' was built under R version 4.3.3
## Loading required package: statnet.common
##
## Attaching package: 'statnet.common'
## The following objects are masked from 'package:base':
##
## attr, order
## Loading required package: network
##
## 'network' 1.18.2 (2023-12-04), part of the Statnet Project
## * 'news(package="network")' for changes since last version
## * 'citation("network")' for citation information
## * 'https://statnet.org' for help, support, and other information
## sna: Tools for Social Network Analysis
## Version 2.8 created on 2024-09-07.
## copyright (c) 2005, Carter T. Butts, University of California-Irvine
## For citation information, type citation("sna").
## Type help(package="sna") to get started.
## This is bipartite 2.20.
## For latest changes see versionlog in ?"bipartite-package". For citation see: citation("bipartite").
## Have a nice time plotting and analysing two-mode networks.
##
## Attaching package: 'bipartite'
## The following object is masked from 'package:vegan':
##
## nullmodel
library(tidyr)
library(readr)
library(vegan)
# setwd("/Users/shirn/OneDrive/Documents/master/data")
setwd("/Users/shirnehoray/EDA/plant_pollinators_prediction/EDA_Eupoll")
# Read the data
data <- read.csv("Interaction_data.csv", encoding = "latin1")
# Get unique species lists for each network
species_per_network <- data %>%
group_by(Bioregion, Network_id) %>%
summarise(
Plants = list(unique(Plant_accepted_name)),
Pollinators = list(unique(Pollinator_accepted_name)),
.groups = "drop"
)
# Function to calculate Jaccard similarity
jaccard_similarity <- function(set1, set2) {
inter <- length(intersect(set1, set2))
union <- length(union(set1, set2))
if (union == 0) return(NA) else return(inter / union)
}
# Loop over each bioregion and create heatmap for each one
bioregions <- unique(species_per_network$Bioregion)
# For plants species
for (bio in bioregions) {
cat("Plotting Bioregion:", bio, "\n")
sub_data <- species_per_network %>% filter(Bioregion == bio)
nets <- sub_data$Network_id
# Build similarity matrix
mat <- matrix(NA, nrow = length(nets), ncol = length(nets),
dimnames = list(nets, nets))
for (i in seq_along(nets)) {
for (j in seq_along(nets)) {
mat[i,j] <- jaccard_similarity(sub_data$Plants[[i]], sub_data$Plants[[j]])
}
}
# Turn into data frame
mat_df <- as.data.frame(as.table(mat))
# Plot heatmap
p <- ggplot(mat_df, aes(Var1, Var2, fill = Freq)) +
geom_tile() +
scale_fill_continuous_sequential(palette = "Viridis",) +
theme_minimal() +
labs(title = paste("Plant species similarity -", bio),
x = "Network_id", y = "Network_id", fill = "Jaccard Index") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 3),
axis.text.y = element_text(size = 3))
print(p)
}
## Plotting Bioregion: Alpine

## Plotting Bioregion: Atlantic

## Plotting Bioregion: Boreal

## Plotting Bioregion: Continental

## Plotting Bioregion: Mediterranean

## Plotting Bioregion: Pannonian

## Plotting Bioregion: Steppic

# For pollinators species
for (bio in bioregions) {
cat("Plotting Bioregion:", bio, "\n")
sub_data <- species_per_network %>% filter(Bioregion == bio)
nets <- sub_data$Network_id
# Build similarity matrix
mat <- matrix(NA, nrow = length(nets), ncol = length(nets),
dimnames = list(nets, nets))
for (i in seq_along(nets)) {
for (j in seq_along(nets)) {
mat[i,j] <- jaccard_similarity(sub_data$Pollinators[[i]], sub_data$Pollinators[[j]])
}
}
# Turn into data frame
mat_df <- as.data.frame(as.table(mat))
# Plot heatmap
p <- ggplot(mat_df, aes(Var1, Var2, fill = Freq)) +
geom_tile() +
scale_fill_continuous_sequential(palette = "Viridis") +
theme_minimal() +
labs(title = paste("Pollinators species similarity -", bio),
x = "Network_id", y = "Network_id", fill = "Jaccard Index") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 3),
axis.text.y = element_text(size = 3))
print(p)
}
## Plotting Bioregion: Alpine

## Plotting Bioregion: Atlantic

## Plotting Bioregion: Boreal

## Plotting Bioregion: Continental

## Plotting Bioregion: Mediterranean

## Plotting Bioregion: Pannonian

## Plotting Bioregion: Steppic

bioregions <- c("Atlantic", "Boreal", "Continental", "Alpine", "Mediterranean", "Steppic", "Pannonian")
min_total <- 15 # minimum total species per network
top_n_pairs <- 100 # how many top pairs to keep (optional)
## =========Utilities===========
normalize_name <- function(x) {
# Optional: standardize species names (trim, collapse spaces, lower-case)
x <- trimws(x)
x <- gsub("\\s+", " ", x)
tolower(x)
}
jaccard_sets <- function(a, b) {
a <- unique(a); b <- unique(b)
ia <- length(intersect(a, b))
ua <- length(union(a, b))
if (ua == 0) NA_real_ else ia / ua
}
## ========= Loop for each bioregions===========
for (bio in bioregions) {
cat(sprintf("\n=== Processing Bioregion: %s ===\n", bio))
## ======== Set Columns ==========
req_cols <- c("Bioregion", "Network_id", "Plant_accepted_name", "Pollinator_accepted_name")
missing_cols <- setdiff(req_cols, names(data))
if (length(missing_cols) > 0) {
cat(paste("Missing required columns:", paste(missing_cols, collapse = ", "), "\n"))
next
}
## ========= Set the chosen bioregion ========
dat_bio <- data[data$Bioregion == bio, , drop = FALSE]
if (nrow(dat_bio) == 0) {
cat(paste("No rows for Bioregion:", bio, "\n"))
next
}
## ===== species counts per network (to filter small networks) ====
# Distinct plants per network
plants_by_net <- aggregate(Plant_accepted_name ~ Network_id, data = dat_bio,
FUN = function(x) length(unique(x)))
names(plants_by_net)[2] <- "n_plants"
# Distinct pollinators per network
polls_by_net <- aggregate(Pollinator_accepted_name ~ Network_id, data = dat_bio,
FUN = function(x) length(unique(x)))
names(polls_by_net)[2] <- "n_pollinators"
# Merge counts and compute total
counts <- merge(plants_by_net, polls_by_net, by = "Network_id", all = TRUE)
counts$n_plants[is.na(counts$n_plants)] <- 0
counts$n_pollinators[is.na(counts$n_pollinators)] <- 0
counts$total_species <- counts$n_plants + counts$n_pollinators
# Keep networks meeting the threshold
nets_keep <- counts$Network_id[counts$total_species >= min_total]
if (length(nets_keep) == 0) {
cat(paste("No networks with total_species >=", min_total, "in", bio, "\n"))
next
}
## ====== Build species sets per network (plants + pollinators together) =====
# Subset to kept networks
dat_bio_sub <- dat_bio[dat_bio$Network_id %in% nets_keep, , drop = FALSE]
# Normalize names (optional but recommended)
plant_names <- normalize_name(dat_bio_sub$Plant_accepted_name)
poll_names <- normalize_name(dat_bio_sub$Pollinator_accepted_name)
# Tag guilds to avoid accidental collisions
plant_tags <- paste0("PLANT::", plant_names)
poll_tags <- paste0("POLL::", poll_names)
# Split row indices by Network_id so we can collect species per network
idx_by_net <- split(seq_len(nrow(dat_bio_sub)), dat_bio_sub$Network_id)
# For each network, take the union of unique plant & pollinator tags
species_items <- lapply(idx_by_net, function(idx) {
unique(c(plant_tags[idx], poll_tags[idx]))
})
# 'species_items' is a named list: names(species_items) are Network_id values
net_ids <- names(species_items)
n <- length(net_ids)
## ======== Pairwise Jaccard matrix =====
cat(sprintf("Number of networks for %s: %d\n", bio, n)) # Diagnostic
sim_mat <- matrix(NA_real_, nrow = n, ncol = n, dimnames = list(net_ids, net_ids))
for (i in seq_len(n)) {
# diagonal
sim_mat[i, i] <- NA_real_
# upper triangle only
if (i < n) {
for (j in (i + 1L):n) {
val <- jaccard_sets(species_items[[i]], species_items[[j]])
sim_mat[i, j] <- val
sim_mat[j, i] <- val
}
}
}
## ===== Tidy table of unique pairs =====
# Convert matrix to data.frame of pairs
# keep only i<j to avoid duplicates
pairs_list <- list()
k <- 0L
for (i in seq_len(n)) {
if (i < n) {
for (j in (i + 1L):n) {
k <- k + 1L
pairs_list[[k]] <- data.frame(
Net1 = net_ids[i],
Net2 = net_ids[j],
Jaccard = sim_mat[i, j],
stringsAsFactors = FALSE
)
}
}
}
sim_df <- do.call(rbind, pairs_list)
if (is.null(sim_df) || nrow(sim_df) == 0 || !("Jaccard" %in% names(sim_df)) || all(is.na(sim_df$Jaccard))) {
cat(sprintf("No valid Jaccard similarities for %s (empty or all NA).\n", bio))
next
}
# Remove NA and sort descending
sim_df <- sim_df[!is.na(sim_df$Jaccard), , drop = FALSE]
ord <- order(sim_df$Jaccard, decreasing = TRUE)
sim_df <- sim_df[ord, , drop = FALSE]
# Keep top N pairs (optional)
if (!is.null(top_n_pairs) && is.finite(top_n_pairs)) {
top_n <- min(top_n_pairs, nrow(sim_df))
sim_top <- sim_df[seq_len(top_n), , drop = FALSE]
} else {
sim_top <- sim_df
}
## ======Results =======
cat(sprintf("\n=== Top similar network pairs for %s ===\n", bio))
print(head(sim_top, 20), row.names = FALSE)
## ======= Heatmap of Jaccard similarities ======
# Convert sim_mat to long format for ggplot
sim_mat_long <- as.data.frame.table(sim_mat, responseName = "Jaccard")
names(sim_mat_long)[1:2] <- c("Net1", "Net2")
sim_mat_long <- sim_mat_long[!is.na(sim_mat_long$Jaccard), ]
# # Create heatmap
# p_heatmap <- ggplot(sim_mat_long, aes(x = Net1, y = Net2, fill = Jaccard)) +
# geom_tile() +
# scale_fill_continuous_sequential(palette = "Viridis", name = "Jaccard Similarity") +
# labs(
# title = sprintf("Jaccard Similarity Heatmap for %s", bio),
# x = "Network ID", y = "Network ID"
# ) +
# theme_minimal() +
# theme(
# axis.text.x = element_text(angle = 75, hjust = 1, size = 3),
# axis.text.y = element_text(size = 3),
# panel.grid = element_blank()
# )
#
# print(p_heatmap)
## ===== Collect per-network coordinates for the networks in top pairs ========
# unique networks from the top pairs
unique_nets <- unique(c(sim_top$Net1, sim_top$Net2))
# subset interactions to the chosen bioregion and those networks
dat_map <- dat_bio_sub[dat_bio_sub$Network_id %in% unique_nets, , drop = FALSE]
if (!all(c("Latitude","Longitude") %in% names(dat_map))) {
cat("Latitude/Longitude columns are missing in the dataset.\n")
next
}
# Ensure numeric (safely)
dat_map$Latitude <- suppressWarnings(as.numeric(dat_map$Latitude))
dat_map$Longitude <- suppressWarnings(as.numeric(dat_map$Longitude))
# remove rows with missing coordinates
dat_map <- dat_map[!is.na(dat_map$Latitude) & !is.na(dat_map$Longitude), , drop = FALSE]
if (nrow(dat_map) == 0) {
cat("No valid coordinates to plot.\n")
next
}
## For each Network_id, take a single representative lat/lon (first unique non-NA)
first_unique <- function(x) {
ux <- unique(x)
ux[!is.na(ux)][1]
}
# get the location for each network
# aggregate coordinates per network
lat_by_net <- aggregate(Latitude ~ Network_id, data = dat_map, FUN = first_unique)
lon_by_net <- aggregate(Longitude ~ Network_id, data = dat_map, FUN = first_unique)
# merge back
net_locs <- merge(lat_by_net, lon_by_net, by = "Network_id", all = TRUE)
# add total species to size points (we already computed 'counts' earlier)
net_locs <- merge(net_locs, counts[, c("Network_id","total_species")], by = "Network_id", all.x = TRUE)
# keep only networks we actually plot
net_locs <- net_locs[net_locs$Network_id %in% unique_nets, , drop = FALSE]
## ======Determine map extents ========
lat_range <- range(net_locs$Latitude, na.rm = TRUE)
lon_range <- range(net_locs$Longitude, na.rm = TRUE)
lat_buffer <- 2
lon_buffer <- 3
xlim <- c(lon_range[1] - lon_buffer, lon_range[2] + lon_buffer)
ylim <- c(lat_range[1] - lat_buffer, lat_range[2] + lat_buffer)
## ====== Base map (Europe) and plot ============
europe <- rnaturalearth::ne_countries(scale = "medium", returnclass = "sf", continent = "Europe")
# simple palette for distinct networks
n_cols <- length(unique(net_locs$Network_id))
cols <- grDevices::hcl.colors(n_cols, palette = "Spectral", rev = TRUE)
p_map <- ggplot(europe) +
geom_sf(fill = "beige", color = "black", linewidth = 0.2) +
geom_point(
data = net_locs,
aes(x = Longitude, y = Latitude, color = Network_id, size = total_species),
alpha = 0.8
) +
scale_color_manual(values = cols, guide = "none") +
scale_size_continuous(range = c(2, 8), name = "Total Species") +
coord_sf(xlim = xlim, ylim = ylim, expand = FALSE, default_crs = NULL) +
labs(
title = sprintf("Top similar networks in %s ", bio),
subtitle = sprintf("Points sized by total species",
lat_range[1], lat_range[2]),
x = "Longitude", y = "Latitude", color = "Network_id"
) +
theme_minimal() +
theme(legend.position = "bottom")
print(p_map)
}
##
## === Processing Bioregion: Atlantic ===
## Number of networks for Atlantic: 215
##
## === Top similar network pairs for Atlantic ===
## Net1
## MG11_E1_BC1
## OS24_E2_BC1
## Creehaun.Carron.Co.Clare_4.2018_GA1.GS1.WS1
## CG43_E2_GA1
## MT4_E1_BC1
## Chize_14_Grassland
## MG18_E1_BC1
## CG43_E1_GA1
## Chize_24_Grassland
## Chateau_Gaillard_2016
## Chize_12_Field_boundary
## Chize_17_Field_boundary
## Chize_5_Field_boundary
## Larris_2016
## Chize_1_Field_boundary
## Chize_1_Field_boundary
## CT32_E1_BC1
## Tibarney_3_GS1
## Carrownalassan_3_GS1
## CG43_E1_GA1
## Net2 Jaccard
## MG11_E2_BC1 0.6071429
## OS26_E1_BC1 0.5909091
## Derreenatloghtan.Co.Clare_1.2017_GS1 0.5652174
## MG18_E2_BC1 0.5454545
## MT4_E2_BC1 0.5454545
## Chize_25_Grassland 0.5434783
## MG18_E2_BC1 0.5416667
## CG43_E2_GA1 0.5217391
## Chize_25_Grassland 0.5094340
## Falaises_2016 0.5086957
## Chize_14_Field_boundary 0.5000000
## Chize_5_Field_boundary 0.5000000
## Chize_9_Field_boundary 0.5000000
## Riez_2016 0.4879518
## Chize_19_Field_boundary 0.4791667
## Chize_1_Grassland 0.4782609
## MT1_E2_BC1 0.4782609
## Turrock_6_GS1 0.4666667
## Tibarney_3_GS1 0.4642857
## MG11_E1_BC1 0.4642857
## Warning in sprintf("Points sized by total species", lat_range[1],
## lat_range[2]): 2 arguments not used by format 'Points sized by total species'
##
## === Processing Bioregion: Boreal ===
## Number of networks for Boreal: 276
##
## === Top similar network pairs for Boreal ===
## Net1 Net2 Jaccard
## L23_2017 L23_2018 1.0000000
## L35_2017 L35_2018 1.0000000
## L39_2017 L39_2018 1.0000000
## L46_2017 L46_2018 1.0000000
## L13_2017 L13_2018 0.9729730
## L27_2019 L27_2020 0.9642857
## L28_2019 L28_2020 0.9500000
## L13_2018 L13_2019 0.9473684
## L28_2020 L28_2021 0.9473684
## L17_2017 L17_2018 0.9354839
## L17_2019 L17_2020 0.9354839
## L23_2017 L23_2019 0.9333333
## L23_2018 L23_2019 0.9333333
## L12_2017 L12_2018 0.9310345
## L13_2017 L13_2019 0.9230769
## L45_2017 L45_2018 0.9166667
## L31_2018 L31_2019 0.9130435
## L31_2017 L31_2018 0.9090909
## L35_2017 L35_2019 0.9090909
## L35_2018 L35_2019 0.9090909
## Warning in sprintf("Points sized by total species", lat_range[1],
## lat_range[2]): 2 arguments not used by format 'Points sized by total species'

##
## === Processing Bioregion: Continental ===
## Number of networks for Continental: 341
##
## === Top similar network pairs for Continental ===
## Net1 Net2 Jaccard
## UNIPD03 CD23 UNIPD03 CD29 0.5733333
## UNIPD03 CD15 UNIPD03 CD16 0.5376344
## UNIPD03 CD27 UNIPD03 CD28 0.5301205
## UNIPD03 CD15 UNIPD03 CD17 0.5268817
## UNIPD03 CD20 UNIPD03 CD27 0.5256410
## UNIPD03 CD35 UNIPD03 D34 0.5230769
## UNIPD03 CD15 UNIPD03 CD22 0.5205479
## UNIPD03 CD16 UNIPD03 CD17 0.5100000
## UNIPD03 D17 UNIPD03 D29 0.5094340
## UNIPD03 CD15 UNIPD03 CD28 0.5054945
## UNIPD03 CD22 UNIPD03 CD23 0.5000000
## UNIPD03 D23 UNIPD03 D24 0.5000000
## UNIPD03 D24 UNIPD03 E35 0.5000000
## UNIPD03 E19 UNIPD03 E36 0.5000000
## UNIPD03 D29 UNIPD03 D35 0.4905660
## UNIPD03 CD16 UNIPD03 CD28 0.4897959
## UNIPD03 E14 UNIPD03 E26 0.4880952
## UNIPD03 D11 UNIPD03 D24 0.4833333
## UNIPD03 CD15 UNIPD03 CD20 0.4827586
## UNIPD03 D24 UNIPD03 E31 0.4821429
## Warning in sprintf("Points sized by total species", lat_range[1],
## lat_range[2]): 2 arguments not used by format 'Points sized by total species'

##
## === Processing Bioregion: Alpine ===
## Number of networks for Alpine: 111
##
## === Top similar network pairs for Alpine ===
## Net1 Net2 Jaccard
## wmb4 wmb5 0.6440678
## t1 t5 0.6136364
## wmb1 wmb2 0.5974026
## ho4 wmb4 0.5937500
## ho4 wmb5 0.5781250
## h1 wma2 0.5593220
## h4 ho4 0.5555556
## h3 wma2 0.5538462
## h6 h7 0.5510204
## wma2 wma3 0.5322581
## h6 ho5 0.5306122
## h4 h5 0.5303030
## h7 ho5 0.5283019
## h1 wmb2 0.5277778
## h2 wma2 0.5270270
## h1 wmb1 0.5217391
## h7 wma5 0.5192308
## wmb1 wmb3 0.5180723
## h4 wmb5 0.5070423
## wmb2 wmb3 0.5057471
## Warning in sprintf("Points sized by total species", lat_range[1],
## lat_range[2]): 2 arguments not used by format 'Points sized by total species'

##
## === Processing Bioregion: Mediterranean ===
## Number of networks for Mediterranean: 99
##
## === Top similar network pairs for Mediterranean ===
## Net1 Net2 Jaccard
## nevero_Encroached pasture_2011 penalara_Encroached pasture_2011 0.5520000
## nevero_Encroached pasture_2011 nevero_Pasture_2011 0.5378151
## nevero_Pasture_2011 penalara_Encroached pasture_2011 0.5217391
## penalara_Encroached pasture_2011 penalara_Pasture_2011 0.5181818
## nevero_Pasture_2011 penalara_Pasture_2011 0.5145631
## FRA1OP FRA2OP 0.5000000
## Canal Menajo 0.4800000
## Canal Pinar 0.4769231
## Menajo Redondela 0.4736842
## MED3CA MED4CA 0.4693878
## FRA2OP MIQ2OP 0.4651163
## BAT1CA BAT2CA 0.4561404
## Menajo Pinar 0.4545455
## MED2CA MED4CA 0.4500000
## Cartaya Pinar 0.4487179
## nevero_Encroached pasture_2011 penalara_Pasture_2011 0.4462810
## Curva Pinar 0.4459459
## Cetrero Pinar 0.4457831
## Barca Pinar 0.4444444
## Bois_de_Fontaret_2016 Fourches_2016 0.4424242
## Warning in sprintf("Points sized by total species", lat_range[1],
## lat_range[2]): 2 arguments not used by format 'Points sized by total species'

##
## === Processing Bioregion: Steppic ===
## Number of networks for Steppic: 1
## No valid Jaccard similarities for Steppic (empty or all NA).
##
## === Processing Bioregion: Pannonian ===
## Number of networks for Pannonian: 4
##
## === Top similar network pairs for Pannonian ===
## Net1 Net2 Jaccard
## 2 5 0.14285714
## 17 5 0.07692308
## 49 5 0.04545455
## 2 49 0.02857143
## 17 49 0.02702703
## 17 2 0.00000000
## Warning in sprintf("Points sized by total species", lat_range[1],
## lat_range[2]): 2 arguments not used by format 'Points sized by total species'

